import numpy as np
import pandas as pd

## Plotly plotting support
# The account-based `plotly.plotly` backend is left disabled; the offline
# module below is bound to the same `py` alias instead.
# import plotly.plotly as py

import plotly.offline as py
import plotly.figure_factory as ff
# Set up plotly's notebook mode so that figures can be shown with `py.iplot`.
py.init_notebook_mode()

import cufflinks as cf
cf.go_offline() # required to use plotly offline (no account required).

# Trace/figure classes (e.g. `go.Scatter`) used by the plotting cells below.
import plotly.graph_objs as go

# Pin the global RNG so the toy training set is identical on every run.
np.random.seed(42)

# Settings of the hidden generating process (unknown in a real problem).
n = 75         # number of training records
noise = 3.5    # scale applied to the standard-normal noise
m = 1.5        # coefficient of the linear term
b = 10.0       # coefficient (amplitude) of the sin(X) term

# Inputs: n uniform draws from [-10, 10), arranged in ascending order.
X = np.sort(np.random.rand(n) * 20. - 10.)

# Responses from the hidden model: linear trend + sinusoid + Gaussian noise.
# The noise is drawn after X, so the order of RNG calls is part of the recipe.
Y = (
    m * X
    + b * np.sin(X)
    + np.random.randn(n) * noise
)

# Plant four hand-picked outliers, then lift every response by 20.
# (A further outlier, Y[40] = -20, is disabled.)
Y[[20, 23, 30, 55]] = [7, -18, 20, -18]
Y += 20

# Keep only the assembled frame; the scratch arrays are dropped.
data = pd.DataFrame({"X": X, "Y": Y})
del X, Y

# Persist the training set next to the notebook.
data.to_csv("toy_training_data.csv", index=False)

# Marker-only scatter trace of the training points, rendered inline.
raw_data = go.Scatter(
    x=data['X'],
    y=data['Y'],
    mode='markers',
    name="Data",
)
py.iplot([raw_data])

# A different seed gives a held-out set drawn from the same hidden model.
np.random.seed(37)

n = 50         # number of test records

# Inputs: n uniform draws from [-10, 10), arranged in ascending order.
X = np.sort(np.random.rand(n) * 20. - 10.)

# Responses reuse the slope `m`, sine coefficient `b` and `noise` scale that
# were set for the training data; the noise is drawn after X.
Y = (
    m * X
    + b * np.sin(X)
    + np.random.randn(n) * noise
)

# Plant three hand-picked outliers, then lift every response by 20.
Y[[10, 23, 1]] = [7, -7, -18]
Y += 20

# Keep only the assembled frame; the scratch arrays are dropped.
test_data = pd.DataFrame({"X": X, "Y": Y})
del X, Y

# Persist the test set next to the notebook.
test_data.to_csv("toy_test_data.csv", index=False)

# Re-seed so the ice cream data set is reproducible independently of the
# cells above.
np.random.seed(42)

# Price per unit of mass for each flavor.
flavor_prices = {"Vanilla": 0.75, "Chocolate": 0.8, "Strawberry": 0.5}

# Flat surcharge for each topping ("None" means no topping, no surcharge).
topping_prices = {"Sprinkles": 0.3, "Fruit": 1.0, "Chocolate": 0.5, "None": 0.0}

n = 200        # number of purchases to simulate

# Three draws, in this order: mass in [1, 5), then flavor, then topping.
weights = np.random.rand(n) * 4 + 1.
flavors = np.random.choice(list(flavor_prices), n)
toppings = np.random.choice(list(topping_prices), n)

# price = mass * flavor rate + topping surcharge, rounded to cents by
# formatting to two decimals and parsing back. The unrounded mass is used
# here, whereas the stored "mass" column below is rounded to one decimal.
price = np.array([
    "{:.2f}".format(mass * flavor_prices[flavor] + topping_prices[topping])
    for mass, flavor, topping in zip(weights, flavors, toppings)
]).astype('float')

icecream = pd.DataFrame(
    {
        "flavor": flavors,
        "topping": toppings,
        "mass": np.round(weights, 1),
        "price": price,
    },
    columns=["flavor", "topping", "mass", "price"],
)

# First 150 rows form the training file, the remaining 50 the test file.
icecream.iloc[:150].to_csv("icecream_train.csv", index=False)
icecream.iloc[150:].to_csv("icecream_test.csv", index=False)

# Notebook preview of the first rows.
icecream.head()

# One-hot encode the categorical columns of the ice cream table.
# NOTE(review): no `df` is defined anywhere in this file, so `icecream` (the
# only frame here with categorical columns) is assumed to be the intended
# input -- confirm.
d = pd.get_dummies(icecream)

from sklearn.feature_extraction import DictVectorizer

# One-hot encode each categorical column on its own. Every row is passed as a
# single-key dict ({"flavor": ...} / {"topping": ...}); DictVectorizer returns
# a scipy sparse matrix with one indicator column per distinct value.
flavor_enc = DictVectorizer()
onehot_flavor = flavor_enc.fit_transform(icecream[["flavor"]].to_dict(orient='records'))

topping_enc = DictVectorizer()
onehot_topping = topping_enc.fit_transform(icecream[["topping"]].to_dict(orient='records'))

import scipy as sp
import scipy.sparse  # load the submodule explicitly so `sp.sparse` always resolves

# Scale every flavor indicator row by that row's mass: left-multiplying by
# diag(mass) multiplies row i by mass[i]. The column is named "mass" in the
# frame built above. `n` must equal the number of rows of `icecream`.
f1 = sp.sparse.spdiags(icecream['mass'].values, 0, n, n) * onehot_flavor

# Feature matrix: [mass * flavor indicators | topping indicators], matching
# price = mass * flavor rate + topping surcharge.
phi = sp.sparse.hstack((f1, onehot_topping))

from sklearn import linear_model

# Least-squares fit of price on the engineered features. No intercept is
# fitted, so the model is a pure linear combination of the columns of `phi`.
reg = linear_model.LinearRegression(fit_intercept=False).fit(phi, icecream['price'])

# In-sample predictions, used for the residual analysis below.
yhat = reg.predict(phi)

# Show the fitted coefficients to two decimals.
reg.coef_.round(2)

# Recorded notebook output:
# array([ 0.8 ,  0.5 ,  0.75,  0.5 ,  1.  ,  0.  ,  0.3 ])

# In-sample residuals: predicted price minus recorded price.
q = yhat - icecream['price']

# Recorded notebook output: -0.043452525523396179
# NOTE(review): the expression that produced this value is not shown here.

import plotly.figure_factory as ff

# Distribution plot of the residuals with very fine (0.001-wide) bins.
residual_fig = ff.create_distplot([q], group_labels=["residuals"], bin_size=0.001)
py.iplot(residual_fig)

# Scale each row of the one-hot flavor matrix by that row's mass.
# `onehot_flavor` is a scipy sparse matrix, for which `*` means matrix
# multiplication and rejects a (rows, 1) operand with a dimension mismatch;
# `.multiply` performs the intended element-wise, broadcast product.
onehot_flavor.multiply(icecream['mass'].values[:, np.newaxis])

# Recorded notebook output (traceback from a session in which `onehot_flavor`
# had not been defined yet):
# ---------------------------------------------------------------------------
# NameError                                 Traceback (most recent call last)
# <ipython-input-4-4f236c9fa5ad> in <module>()
# ----> 1 onehot_flavor * icecream['mass'].values[:, np.newaxis]
#
# NameError: name 'onehot_flavor' is not defined

# Number of rows in the table, read off the mass column (the frame built in
# this file names that column "mass"; there is no "weight" column).
len(icecream['mass'].values)

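# Recorded notebook output: 500
# NOTE(review): the frame built in this file has n = 200 rows, so this value
# presumably comes from a run on a different table -- confirm.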

import scipy.sparse

# Row-scale the one-hot flavor matrix by mass: left-multiplying by diag(mass)
# multiplies row i by mass[i]. The column is named "mass" in the frame built
# in this file. `n` must equal the number of rows of `icecream`.
scipy.sparse.spdiags(icecream['mass'].values, 0, n, n) * onehot_flavor

# Recorded notebook output:
# <500x3 sparse matrix of type '<class 'numpy.float64'>'
#     with 0 stored elements in Compressed Sparse Row format>

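# Recorded notebook output (table preview; this run shows a `weight` column
# where the frame built in this file has `mass`):
#      flavor      topping     weight  price
# 0    Chocolate   Chocolate   3.6     3.35
# 1    Chocolate   Sprinkles   5.0     4.27
# 2    Chocolate   None        3.3     2.68
# 3    Vanilla     Fruit       3.7     3.74
# 4    Vanilla     Chocolate   2.2     2.12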